##read in data file:
dat<-read.csv(file='W:/R Course (sponsored by BERD) 2020/Part II/Topic6/dat.csv',header=T)

##check variables:
summary(dat)

##analysis after removing missing values:
mean(dat$y1)
mean(dat$y1,na.rm=T)

var(dat$y1)
var(dat$y1,na.rm=T)

cor(dat$y1,dat$y2)
cor(dat$y1,dat$y2,use="complete.obs")

lm(y2~y1+x1+x2,data=dat)

##check missing pattern:
library(naniar)
vis_miss(dat)
gg_miss_upset(dat)
gg_miss_var(dat)

##check missing mechanism:
par(mfrow=c(3,3))
boxplot(dat$x1 ~ dat$r1, col="orange", main="Distribution of x1", ylab="x1", xlab="r1") 
boxplot(dat$x1 ~ dat$r2, col="red", main="Distribution of x1", ylab="x1", xlab="r2") 
boxplot(dat$x1 ~ dat$r3, col="blue", main="Distribution of x1", ylab="x1", xlab="r3") 
boxplot(dat$x2 ~ dat$r1, col="orange", main="Distribution of x2", ylab="x2", xlab="r1") 
boxplot(dat$x2 ~ dat$r2, col="red", main="Distribution of x2", ylab="x2", xlab="r2") 
boxplot(dat$x2 ~ dat$r3, col="blue", main="Distribution of x2", ylab="x2", xlab="r3") 
boxplot(dat$x3 ~ dat$r1, col="orange", main="Distribution of x3", ylab="x3", xlab="r1") 
boxplot(dat$x3 ~ dat$r2, col="red", main="Distribution of x3", ylab="x3", xlab="r2") 
boxplot(dat$x3 ~ dat$r3, col="blue", main="Distribution of x3", ylab="x3", xlab="r3") 

library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)

ggplot(dat, aes(x= x4,  group=r1)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="x4") +
    facet_grid(~r1) +
    scale_y_continuous(labels = scales::percent)

ggplot(dat, aes(x= x4,  group=r2)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="x4") +
    facet_grid(~r2) +
    scale_y_continuous(labels = scales::percent)

ggplot(dat, aes(x= x4,  group=r3)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="x4") +
    facet_grid(~r3) +
    scale_y_continuous(labels = scales::percent)

ggplot(dat, aes(x= x5,  group=r1)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="x5") +
    facet_grid(~r1) +
    scale_y_continuous(labels = scales::percent)

ggplot(dat, aes(x= x5,  group=r2)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="x5") +
    facet_grid(~r2) +
    scale_y_continuous(labels = scales::percent)

ggplot(dat, aes(x= x5,  group=r3)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="x5") +
    facet_grid(~r3) +
    scale_y_continuous(labels = scales::percent)


fit1<-glm(r1~x1+x2+x3+x4+as.character(x5),data=dat,family=binomial(link = "logit"))
summary(fit1)
fit2<-glm(r2~x1+x2+x3+x4+as.character(x5),data=dat,family=binomial(link = "logit"))
summary(fit2)
fit3<-glm(r3~x1+x2+x3+x4+as.character(x5),data=dat,family=binomial(link = "logit"))
summary(fit3)

##imputation model check:
pairs(dat[,c(2,3,4,7,8)])

par(mfrow=c(2,3))
boxplot(dat$y1 ~ dat$x4, col="orange", main="Distribution of y1", ylab="y1", xlab="x4") 
boxplot(dat$y1 ~ dat$x5, col="blue", main="Distribution of y1", ylab="y1", xlab="x5")
boxplot(dat$y1 ~ dat$y3, col="red", main="Distribution of y1", ylab="y1", xlab="y3") 
boxplot(dat$y2 ~ dat$x4, col="orange", main="Distribution of y2", ylab="y2", xlab="x4") 
boxplot(dat$y2 ~ dat$x5, col="blue", main="Distribution of y2", ylab="y2", xlab="x5")
boxplot(dat$y2 ~ dat$y3, col="red", main="Distribution of y2", ylab="y2", xlab="y3") 

fit1_m<-glm(y1~x1+x2+x3+x4+as.character(x5),data=dat)
summary(fit1_m)

fit2_m<-glm(y2~x1+x2+x3+x4+as.character(x5)+y1,data=dat)
summary(fit2_m)

fit3_m<-glm(y3~x1+x2+x3+x4+as.character(x5)+y1+y2,data=dat,family=binomial(link = "logit"))
summary(fit3_m)

##imputation by using 'mice' package:
library(mice)
dat$x4<-as.factor(dat$x4)
dat$x5<-as.factor(dat$x5)
dat$y3<-as.factor(dat$y3)

dat2<-dat[,c(2:9)]
datm1<-mice(dat2)
datm2<-mice(dat2, meth = c("pmm","pmm","pmm","logreg","polyreg","pmm","pmm","logreg"))

PM<-rbind(c(0,rep(1,7)),c(1,0,rep(1,6)),c(1,1,0,rep(1,5)),c(rep(1,3),0,rep(1,4)),
c(rep(1,4),0,rep(1,3)),c(rep(1,4),rep(0,4)),c(rep(1,4),0,1,0,0),c(1,1,1,0,0,1,1,0))
datm3<-mice(dat2,meth=c("pmm","pmm","pmm","logreg","polyreg","pmm","pmm","logreg"),
predictorMatrix=PM)

##output:
attributes(datm3)
datm3$predictorMatrix

##pooled analysis by using Rubin's rule:
fit <- with(data = datm3, exp = lm(y1 ~ x1 + x2 + x3 + x4))
summary(pool(fit))

fit <- with(data = datm3, exp = glm(y3 ~ x1 + x2 + x3 + x4 + y1 + y2,family=binomial(link = "logit")))
summary(pool(fit))

fit <- with(data = datm3, exp = lm(y1~1))
summary(pool(fit))

##Propensity score weighting:

#estimate propensity scores for responding:
pfit1<-glm(r1~x1+x2+x3+x4+x5,data=dat,family=binomial(link = "logit"))
ep1<-pfit1$fitted.values
ew1<-1/ep1

pfit2<-glm(r2~x1+x2+x3+x4+x5,data=dat,family=binomial(link = "logit"))
ep2<-pfit2$fitted.values
ew2<-1/ep2

pfit3<-glm(r3~x1+x2+x3+x4+x5,data=dat,family=binomial(link = "logit"))
ep3<-pfit3$fitted.values
ew3<-1/ep3

dat3<-cbind(dat,ew1,ew2,ew3)

##weighted analysis:
a1<-glm(y1~x1+x2+x3+x4,data=dat3,weights=ew1)
summary(a1)

a2<-glm(y2~x1+x2+x3+x4,data=dat3,weights=ew2)
summary(a2)

a3<-glm(y3~x1+x2+x3+x4,data=dat3,weights=ew3,family=binomial(link = "logit"))
summary(a3)